### Document-wide knitr chunk defaults: figure width/height, the path
### prefix for saved figures, and which chunk side-output is shown
### (code echoed; messages and warnings hidden).
knitr::opts_chunk$set(
  fig.width  = 6,
  fig.height = 4,
  fig.path   = 'Figs/',
  echo       = TRUE,
  message    = FALSE,
  warning    = FALSE
)

library(tidyverse)
library(RColorBrewer)
library(stringr)

### Session-wide options
### stringsAsFactors = FALSE: keep text columns as character vectors
### instead of having them converted to factors
options(stringsAsFactors = FALSE) ### Ensure strings come in as character types

### Shared ggplot2 theme for the figures in this document.
###   base_size: text size applied to every text element (default 9);
###              the plot title is scaled to 1.25x of its inherited size.
### Returns the value of ggplot2::theme(), to be added to a plot with `+`.
ggtheme_plot <- function(base_size = 9) {
  ### one blank element reused for everything that should not be drawn
  hidden <- element_blank()

  body_text  <- element_text(family = 'Helvetica', color = 'gray30', size = base_size)
  title_text <- element_text(size = rel(1.25), hjust = 0, face = 'bold')
  ### faint major grid lines; swap in `hidden` here to remove them entirely
  major_grid <- element_line(colour = 'grey90', size = .25)
  ### legend keys drawn without outline or fill
  bare_key   <- element_rect(colour = NA, fill = NA)

  theme(
    axis.ticks       = hidden,
    text             = body_text,
    plot.title       = title_text,
    panel.background = hidden,
    legend.position  = 'right',
    panel.border     = hidden,
    panel.grid.minor = hidden,
    panel.grid.major = major_grid,
    legend.key       = bare_key,
    ### no axis lines; element_line(colour = "grey30", size = .5) would draw them
    axis.line        = hidden
  )
}

1 Plot a few parameters against each other

1.1 Parameters vs temperature

### Regress every water-quality parameter against same-day mean water
### temperature; plot the ones whose adjusted R^2 clears R_thresh and
### print the adjusted R^2 for all of them.
z <- read_csv('data/water_qual_student.csv')

### minimum adjusted R^2 required for a parameter to get a scatter plot
R_thresh <- .1

### predictor: mean of all non-missing temperature readings per SampleDate
temp_df <- z %>%
  filter(param_desc == 'WATER TEMPERATURE DEG C') %>%
  filter(!is.na(MeasureValue)) %>%
  group_by(SampleDate) %>%
  summarize(temp = mean(MeasureValue))

### every parameter except the predictor itself
params <- z$param_desc %>% unique() %>%
  .[!str_detect(., 'WATER TEMPERATURE DEG C')]

for (param in params) {
  ### for interactive debugging: param <- params[1]

  ### observations of this parameter, paired by date with the predictor
  tmp <- z %>%
    filter(param_desc %in% param) %>%
    inner_join(temp_df, by = 'SampleDate') %>%
    select(FIPS, place_name, EventId, SampleDate, param_desc, MeasureValue, temp, Unit) %>%
    distinct() %>%
    arrange(SampleDate)

  ### for larger samples, trim the extreme upper tail (values at or above
  ### the 99.5th percentile); filter() also drops rows with NA MeasureValue
  if(nrow(tmp) > 500) {
    tmp <- tmp %>%
      filter(MeasureValue < quantile(MeasureValue, .995, na.rm = TRUE))
  }

  ### lm() stops with an error when there are no usable rows, and adjusted
  ### R^2 is undefined for fewer than 3; report and move on instead of
  ### aborting the whole document
  n_obs <- sum(!is.na(tmp$MeasureValue) & !is.na(tmp$temp))
  if(n_obs < 3) {
    cat(sprintf('<br>%s vs Temp:<br>  too few paired observations (n = %d)<hr>', param, n_obs))
    next
  }

  mdl_R <- lm(MeasureValue ~ temp, data = tmp) %>%
    summary()

  ### isTRUE() treats an undefined (NaN/NA) adjusted R^2 as "below threshold"
  ### rather than raising an error inside if()
  if(isTRUE(mdl_R$adj.r.squared > R_thresh)) {
    plot_units <- tmp$Unit[!is.na(tmp$Unit)][1]

    ### `Parameter` is not among the columns kept in `tmp`, so tmp$Parameter
    ### would be NULL and the axis label would lose its name. Look the short
    ### code up in `z` instead; fall back to the full description if the
    ### column is absent or empty for this parameter.
    short_codes <- if('Parameter' %in% names(z)) {
      z$Parameter[z$param_desc %in% param]
    } else {
      character(0)
    }
    short_codes <- short_codes[!is.na(short_codes)]
    plot_param_short <- if(length(short_codes) > 0) short_codes[1] else param

    param_plot <- ggplot(tmp, aes(x = temp, y = MeasureValue)) +
      geom_point(aes(color = place_name), alpha = .5) +
      stat_smooth(method = 'lm', color = 'grey20', size = .5) +
      labs(title = tools::toTitleCase(param),
           y = paste0(plot_param_short, ' (', plot_units, ')'),
           x = 'Temperature (°C)')

    print(param_plot)
  }

  ### NOTE: the value printed under the "R^2" label is the adjusted R^2
  cat(sprintf('<br>%s vs Temp:<br>  R^2^ = %.4f<hr>', param, mdl_R$adj.r.squared))
}

WHOLE 5-DAY BIOCHEMICAL OXYGEN DEMAND MG/L vs Temp:
R2 = 0.0197

ACTIVE CHLOROPHYLL-A UG/L vs Temp:
R2 = 0.1386

DISSOLVED OXYGEN IN MG/L MG/L vs Temp:
R2 = 0.5506

HARDNESS AS CACO3 MG/L vs Temp:
R2 = 0.0063

AMMONIUM NITROGEN AS N (FILTERED SAMPLE) MG/L vs Temp:
R2 = 0.0053

NITRITE+NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Temp:
R2 = 0.0918

NITRITE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Temp:
R2 = 0.0233

NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Temp:
R2 = 0.1051

PH CORRECTED FOR TEMPERATURE (25 DEG C) SU vs Temp:
R2 = 0.0036

SALINITY UNITS IN PPT AND EQUAL TO PRACTICAL SALNITY UNITS (PSU) PPT vs Temp:
R2 = 0.0692

SECCHI DEPTH M vs Temp:
R2 = 0.0140

TOTAL ALKALINITY AS CACO3 MG/L vs Temp:
R2 = 0.0163

TOTAL DISSOLVED NITROGEN MG/L vs Temp:
R2 = 0.0962

TOTAL DISSOLVED PHOSPHORUS MG/L vs Temp:
R2 = 0.0035

TOTAL SUSPENDED SOLIDS MG/L vs Temp:
R2 = 0.0060

TURBIDITY; NEPHELOMETRIC METHOD NTU vs Temp:
R2 = 0.0096

1.2 Parameters vs oxygen

### Regress every water-quality parameter against same-day mean dissolved
### oxygen; plot the ones whose adjusted R^2 clears R_thresh and print the
### adjusted R^2 for all of them.
z <- read_csv('data/water_qual_student.csv')

### minimum adjusted R^2 required for a parameter to get a scatter plot
R_thresh <- .1

### predictor: mean of all non-missing dissolved-oxygen readings per SampleDate
o2_df <- z %>%
  filter(param_desc == 'DISSOLVED OXYGEN IN MG/L MG/L') %>%
  filter(!is.na(MeasureValue)) %>%
  group_by(SampleDate) %>%
  summarize(o2 = mean(MeasureValue))

### every parameter whose description does not mention dissolved oxygen
params <- z$param_desc %>% unique() %>%
  .[!str_detect(., 'DISSOLVED OXYGEN')]

for (param in params) {
  ### for interactive debugging: param <- params[1]

  ### observations of this parameter, paired by date with the predictor
  tmp <- z %>%
    filter(param_desc %in% param) %>%
    inner_join(o2_df, by = 'SampleDate') %>%
    select(FIPS, place_name, EventId, SampleDate, param_desc, MeasureValue, o2, Unit) %>%
    distinct() %>%
    arrange(SampleDate)

  ### for larger samples, trim the extreme upper tail (values at or above
  ### the 99.5th percentile); filter() also drops rows with NA MeasureValue
  if(nrow(tmp) > 500) {
    tmp <- tmp %>%
      filter(MeasureValue < quantile(MeasureValue, .995, na.rm = TRUE))
  }

  ### lm() stops with an error when there are no usable rows, and adjusted
  ### R^2 is undefined for fewer than 3; report and move on instead of
  ### aborting the whole document
  n_obs <- sum(!is.na(tmp$MeasureValue) & !is.na(tmp$o2))
  if(n_obs < 3) {
    cat(sprintf('<br>%s vs Dissolved Oxygen:<br>  too few paired observations (n = %d)<hr>', param, n_obs))
    next
  }

  mdl_R <- lm(MeasureValue ~ o2, data = tmp) %>%
    summary()

  ### isTRUE() treats an undefined (NaN/NA) adjusted R^2 as "below threshold"
  ### rather than raising an error inside if()
  if(isTRUE(mdl_R$adj.r.squared > R_thresh)) {
    plot_units <- tmp$Unit[!is.na(tmp$Unit)][1]

    ### `Parameter` is not among the columns kept in `tmp`, so tmp$Parameter
    ### would be NULL and the axis label would lose its name. Look the short
    ### code up in `z` instead; fall back to the full description if the
    ### column is absent or empty for this parameter.
    short_codes <- if('Parameter' %in% names(z)) {
      z$Parameter[z$param_desc %in% param]
    } else {
      character(0)
    }
    short_codes <- short_codes[!is.na(short_codes)]
    plot_param_short <- if(length(short_codes) > 0) short_codes[1] else param

    param_plot <- ggplot(tmp, aes(x = o2, y = MeasureValue)) +
      geom_point(aes(color = place_name), alpha = .5) +
      stat_smooth(method = 'lm', color = 'grey20', size = .5) +
      labs(title = tools::toTitleCase(param),
           y = paste0(plot_param_short, ' (', plot_units, ')'),
           x = 'Dissolved Oxygen (mg/L)')

    print(param_plot)
  }

  ### NOTE: the value printed under the "R^2" label is the adjusted R^2
  cat(sprintf('<br>%s vs Dissolved Oxygen:<br>  R^2^ = %.4f<hr>', param, mdl_R$adj.r.squared))
}

WHOLE 5-DAY BIOCHEMICAL OXYGEN DEMAND MG/L vs Dissolved Oxygen:
R2 = 0.0243

ACTIVE CHLOROPHYLL-A UG/L vs Dissolved Oxygen:
R2 = 0.0699

HARDNESS AS CACO3 MG/L vs Dissolved Oxygen:
R2 = 0.0346

AMMONIUM NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Oxygen:
R2 = 0.0025

NITRITE+NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Oxygen:
R2 = 0.1068

NITRITE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Oxygen:
R2 = 0.0581

NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Oxygen:
R2 = 0.1074

PH CORRECTED FOR TEMPERATURE (25 DEG C) SU vs Dissolved Oxygen:
R2 = 0.0017

SALINITY UNITS IN PPT AND EQUAL TO PRACTICAL SALNITY UNITS (PSU) PPT vs Dissolved Oxygen:
R2 = 0.0614

SECCHI DEPTH M vs Dissolved Oxygen:
R2 = 0.0381

TOTAL ALKALINITY AS CACO3 MG/L vs Dissolved Oxygen:
R2 = 0.0035

TOTAL DISSOLVED NITROGEN MG/L vs Dissolved Oxygen:
R2 = 0.0973

TOTAL DISSOLVED PHOSPHORUS MG/L vs Dissolved Oxygen:
R2 = -0.0014

TOTAL SUSPENDED SOLIDS MG/L vs Dissolved Oxygen:
R2 = 0.0026

TURBIDITY; NEPHELOMETRIC METHOD NTU vs Dissolved Oxygen:
R2 = -0.0000

WATER TEMPERATURE DEG C vs Dissolved Oxygen:
R2 = 0.6943

1.3 Parameters vs nitrogen

### Regress every water-quality parameter against same-day mean dissolved
### nitrogen; plot the ones whose adjusted R^2 clears R_thresh and print
### the adjusted R^2 for all of them.
z <- read_csv('data/water_qual_student.csv')

### minimum adjusted R^2 required for a parameter to get a scatter plot
R_thresh <- .1

### predictor: mean of all non-missing readings per SampleDate for any
### parameter whose description contains 'DISSOLVED NITROGEN'
n2_df <- z %>%
  filter(str_detect(param_desc, 'DISSOLVED NITROGEN')) %>%
  filter(!is.na(MeasureValue)) %>%
  group_by(SampleDate) %>%
  summarize(n2 = mean(MeasureValue))

### every parameter whose description does not mention dissolved nitrogen
params <- z$param_desc %>% unique() %>%
  .[!str_detect(., 'DISSOLVED NITROGEN')]

for (param in params) {
  ### for interactive debugging: param <- params[1]

  ### observations of this parameter, paired by date with the predictor
  tmp <- z %>%
    filter(param_desc %in% param) %>%
    inner_join(n2_df, by = 'SampleDate') %>%
    select(FIPS, place_name, EventId, SampleDate, param_desc, MeasureValue, n2, Unit) %>%
    distinct() %>%
    arrange(SampleDate)

  ### for larger samples, trim the extreme upper tail (values at or above
  ### the 99.5th percentile); filter() also drops rows with NA MeasureValue
  if(nrow(tmp) > 500) {
    tmp <- tmp %>%
      filter(MeasureValue < quantile(MeasureValue, .995, na.rm = TRUE))
  }

  ### lm() stops with an error when there are no usable rows, and adjusted
  ### R^2 is undefined for fewer than 3; report and move on instead of
  ### aborting the whole document
  n_obs <- sum(!is.na(tmp$MeasureValue) & !is.na(tmp$n2))
  if(n_obs < 3) {
    cat(sprintf('<br>%s vs Dissolved Nitrogen:<br>  too few paired observations (n = %d)<hr>', param, n_obs))
    next
  }

  mdl_R <- lm(MeasureValue ~ n2, data = tmp) %>%
    summary()

  ### isTRUE() treats an undefined (NaN/NA) adjusted R^2 as "below threshold"
  ### rather than raising an error inside if()
  if(isTRUE(mdl_R$adj.r.squared > R_thresh)) {
    plot_units <- tmp$Unit[!is.na(tmp$Unit)][1]

    ### `Parameter` is not among the columns kept in `tmp`, so tmp$Parameter
    ### would be NULL and the axis label would lose its name. Look the short
    ### code up in `z` instead; fall back to the full description if the
    ### column is absent or empty for this parameter.
    short_codes <- if('Parameter' %in% names(z)) {
      z$Parameter[z$param_desc %in% param]
    } else {
      character(0)
    }
    short_codes <- short_codes[!is.na(short_codes)]
    plot_param_short <- if(length(short_codes) > 0) short_codes[1] else param

    param_plot <- ggplot(tmp, aes(x = n2, y = MeasureValue)) +
      geom_point(aes(color = place_name), alpha = .5) +
      stat_smooth(method = 'lm', color = 'grey20', size = .5) +
      labs(title = tools::toTitleCase(param),
           y = paste0(plot_param_short, ' (', plot_units, ')'),
           x = 'Dissolved Nitrogen (mg/L)')

    print(param_plot)
  }

  ### NOTE: the value printed under the "R^2" label is the adjusted R^2
  cat(sprintf('<br>%s vs Dissolved Nitrogen:<br>  R^2^ = %.4f<hr>', param, mdl_R$adj.r.squared))
}

WHOLE 5-DAY BIOCHEMICAL OXYGEN DEMAND MG/L vs Dissolved Nitrogen:
R2 = 0.0070

ACTIVE CHLOROPHYLL-A UG/L vs Dissolved Nitrogen:
R2 = 0.0066

DISSOLVED OXYGEN IN MG/L MG/L vs Dissolved Nitrogen:
R2 = 0.1415

HARDNESS AS CACO3 MG/L vs Dissolved Nitrogen:
R2 = -0.0040

AMMONIUM NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Nitrogen:
R2 = 0.0767

NITRITE+NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Nitrogen:
R2 = 0.3626

NITRITE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Nitrogen:
R2 = 0.0367

NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Nitrogen:
R2 = 0.3513

PH CORRECTED FOR TEMPERATURE (25 DEG C) SU vs Dissolved Nitrogen:
R2 = 0.0198

SALINITY UNITS IN PPT AND EQUAL TO PRACTICAL SALNITY UNITS (PSU) PPT vs Dissolved Nitrogen:
R2 = 0.0590

SECCHI DEPTH M vs Dissolved Nitrogen:
R2 = 0.0143

TOTAL ALKALINITY AS CACO3 MG/L vs Dissolved Nitrogen:
R2 = 0.0254

TOTAL DISSOLVED PHOSPHORUS MG/L vs Dissolved Nitrogen:
R2 = 0.0177

TOTAL SUSPENDED SOLIDS MG/L vs Dissolved Nitrogen:
R2 = 0.0201

TURBIDITY; NEPHELOMETRIC METHOD NTU vs Dissolved Nitrogen:
R2 = 0.0268

WATER TEMPERATURE DEG C vs Dissolved Nitrogen:
R2 = 0.1974

1.4 Parameters vs phosphorus

### Regress every water-quality parameter against same-day mean dissolved
### phosphorus; plot the ones whose adjusted R^2 clears R_thresh and print
### the adjusted R^2 for all of them.
z <- read_csv('data/water_qual_student.csv')

### minimum adjusted R^2 required for a parameter to get a scatter plot
R_thresh <- .1

### predictor: mean of all non-missing readings per SampleDate for any
### parameter whose description contains 'DISSOLVED PHOSPHORUS'; dates
### whose mean falls at or above the 99th percentile are dropped
p_df <- z %>%
  filter(str_detect(param_desc, 'DISSOLVED PHOSPHORUS')) %>%
  filter(!is.na(MeasureValue)) %>%
  group_by(SampleDate) %>%
  summarize(p = mean(MeasureValue)) %>%
  filter(p < quantile(p, .99))

### every parameter whose description does not mention dissolved phosphorus
params <- z$param_desc %>% unique() %>%
  .[!str_detect(., 'DISSOLVED PHOSPHORUS')]

for (param in params) {
  ### for interactive debugging: param <- params[1]

  ### observations of this parameter, paired by date with the predictor
  tmp <- z %>%
    filter(param_desc %in% param) %>%
    inner_join(p_df, by = 'SampleDate') %>%
    select(FIPS, place_name, EventId, SampleDate, param_desc, MeasureValue, p, Unit) %>%
    distinct() %>%
    arrange(SampleDate)

  ### for larger samples, trim the extreme upper tail (values at or above
  ### the 99.5th percentile); filter() also drops rows with NA MeasureValue
  if(nrow(tmp) > 500) {
    tmp <- tmp %>%
      filter(MeasureValue < quantile(MeasureValue, .995, na.rm = TRUE))
  }

  ### lm() stops with an error when there are no usable rows, and adjusted
  ### R^2 is undefined for fewer than 3; report and move on instead of
  ### aborting the whole document
  n_obs <- sum(!is.na(tmp$MeasureValue) & !is.na(tmp$p))
  if(n_obs < 3) {
    cat(sprintf('<br>%s vs Dissolved Phosphorus:<br>  too few paired observations (n = %d)<hr>', param, n_obs))
    next
  }

  mdl_R <- lm(MeasureValue ~ p, data = tmp) %>%
    summary()

  ### isTRUE() treats an undefined (NaN/NA) adjusted R^2 as "below threshold"
  ### rather than raising an error inside if()
  if(isTRUE(mdl_R$adj.r.squared > R_thresh)) {
    plot_units <- tmp$Unit[!is.na(tmp$Unit)][1]

    ### `Parameter` is not among the columns kept in `tmp`, so tmp$Parameter
    ### would be NULL and the axis label would lose its name. Look the short
    ### code up in `z` instead; fall back to the full description if the
    ### column is absent or empty for this parameter.
    short_codes <- if('Parameter' %in% names(z)) {
      z$Parameter[z$param_desc %in% param]
    } else {
      character(0)
    }
    short_codes <- short_codes[!is.na(short_codes)]
    plot_param_short <- if(length(short_codes) > 0) short_codes[1] else param

    param_plot <- ggplot(tmp, aes(x = p, y = MeasureValue)) +
      geom_point(aes(color = place_name), alpha = .5) +
      stat_smooth(method = 'lm', color = 'grey20', size = .5) +
      labs(title = tools::toTitleCase(param),
           y = paste0(plot_param_short, ' (', plot_units, ')'),
           x = 'Dissolved Phosphorus (mg/L)')

    print(param_plot)
  }

  ### NOTE: the value printed under the "R^2" label is the adjusted R^2
  cat(sprintf('<br>%s vs Dissolved Phosphorus:<br>  R^2^ = %.4f<hr>', param, mdl_R$adj.r.squared))
}

WHOLE 5-DAY BIOCHEMICAL OXYGEN DEMAND MG/L vs Dissolved Phosphorus:
R2 = 0.1360

ACTIVE CHLOROPHYLL-A UG/L vs Dissolved Phosphorus:
R2 = 0.0186

DISSOLVED OXYGEN IN MG/L MG/L vs Dissolved Phosphorus:
R2 = 0.0029

HARDNESS AS CACO3 MG/L vs Dissolved Phosphorus:
R2 = 0.0754

AMMONIUM NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Phosphorus:
R2 = 0.1170

NITRITE+NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Phosphorus:
R2 = 0.0209

NITRITE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Phosphorus:
R2 = 0.1398

NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Phosphorus:
R2 = 0.0251

PH CORRECTED FOR TEMPERATURE (25 DEG C) SU vs Dissolved Phosphorus:
R2 = 0.0744

SALINITY UNITS IN PPT AND EQUAL TO PRACTICAL SALNITY UNITS (PSU) PPT vs Dissolved Phosphorus:
R2 = 0.0078

SECCHI DEPTH M vs Dissolved Phosphorus:
R2 = 0.0656

TOTAL ALKALINITY AS CACO3 MG/L vs Dissolved Phosphorus:
R2 = -0.0021

TOTAL DISSOLVED NITROGEN MG/L vs Dissolved Phosphorus:
R2 = 0.0256

TOTAL SUSPENDED SOLIDS MG/L vs Dissolved Phosphorus:
R2 = 0.1065

TURBIDITY; NEPHELOMETRIC METHOD NTU vs Dissolved Phosphorus:
R2 = 0.1935

WATER TEMPERATURE DEG C vs Dissolved Phosphorus:
R2 = 0.0136

1.5 Parameters vs turbidity

### Regress every water-quality parameter against same-day mean turbidity;
### plot the ones whose adjusted R^2 clears R_thresh and print the adjusted
### R^2 for all of them.
z <- read_csv('data/water_qual_student.csv')

### minimum adjusted R^2 required for a parameter to get a scatter plot
R_thresh <- .1

### predictor: mean of all non-missing readings per SampleDate for any
### parameter whose description contains 'TURBIDITY'; dates whose mean
### falls at or above the 99th percentile are dropped
turb_df <- z %>%
  filter(str_detect(param_desc, 'TURBIDITY')) %>%
  filter(!is.na(MeasureValue)) %>%
  group_by(SampleDate) %>%
  summarize(turb = mean(MeasureValue)) %>%
  filter(turb < quantile(turb, .99))

### every parameter whose description does not mention turbidity
params <- z$param_desc %>% unique() %>%
  .[!str_detect(., 'TURBIDITY')]

for (param in params) {
  ### for interactive debugging: param <- params[1]

  ### observations of this parameter, paired by date with the predictor
  tmp <- z %>%
    filter(param_desc %in% param) %>%
    inner_join(turb_df, by = 'SampleDate') %>%
    select(FIPS, place_name, EventId, SampleDate, param_desc, MeasureValue, turb, Unit) %>%
    distinct() %>%
    arrange(SampleDate)

  ### for larger samples, trim the extreme upper tail (values at or above
  ### the 99.5th percentile); filter() also drops rows with NA MeasureValue
  if(nrow(tmp) > 500) {
    tmp <- tmp %>%
      filter(MeasureValue < quantile(MeasureValue, .995, na.rm = TRUE))
  }

  ### lm() stops with an error when there are no usable rows, and adjusted
  ### R^2 is undefined for fewer than 3; report and move on instead of
  ### aborting the whole document
  n_obs <- sum(!is.na(tmp$MeasureValue) & !is.na(tmp$turb))
  if(n_obs < 3) {
    cat(sprintf('<br>%s vs Turbidity:<br>  too few paired observations (n = %d)<hr>', param, n_obs))
    next
  }

  mdl_R <- lm(MeasureValue ~ turb, data = tmp) %>%
    summary()

  ### isTRUE() treats an undefined (NaN/NA) adjusted R^2 as "below threshold"
  ### rather than raising an error inside if()
  if(isTRUE(mdl_R$adj.r.squared > R_thresh)) {
    plot_units <- tmp$Unit[!is.na(tmp$Unit)][1]

    ### `Parameter` is not among the columns kept in `tmp`, so tmp$Parameter
    ### would be NULL and the axis label would lose its name. Look the short
    ### code up in `z` instead; fall back to the full description if the
    ### column is absent or empty for this parameter.
    short_codes <- if('Parameter' %in% names(z)) {
      z$Parameter[z$param_desc %in% param]
    } else {
      character(0)
    }
    short_codes <- short_codes[!is.na(short_codes)]
    plot_param_short <- if(length(short_codes) > 0) short_codes[1] else param

    param_plot <- ggplot(tmp, aes(x = turb, y = MeasureValue)) +
      geom_point(aes(color = place_name), alpha = .5) +
      stat_smooth(method = 'lm', color = 'grey20', size = .5) +
      labs(title = tools::toTitleCase(param),
           y = paste0(plot_param_short, ' (', plot_units, ')'),
           x = 'Turbidity (NTU)')

    print(param_plot)
  }

  ### NOTE: the value printed under the "R^2" label is the adjusted R^2
  cat(sprintf('<br>%s vs Turbidity:<br>  R^2^ = %.4f<hr>', param, mdl_R$adj.r.squared))
}

WHOLE 5-DAY BIOCHEMICAL OXYGEN DEMAND MG/L vs Turbidity:
R2 = 0.0651

ACTIVE CHLOROPHYLL-A UG/L vs Turbidity:
R2 = 0.0152

DISSOLVED OXYGEN IN MG/L MG/L vs Turbidity:
R2 = -0.0002

HARDNESS AS CACO3 MG/L vs Turbidity:
R2 = 0.0725

AMMONIUM NITROGEN AS N (FILTERED SAMPLE) MG/L vs Turbidity:
R2 = 0.0074

NITRITE+NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Turbidity:
R2 = 0.0277

NITRITE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Turbidity:
R2 = 0.0030

NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Turbidity:
R2 = 0.0208

PH CORRECTED FOR TEMPERATURE (25 DEG C) SU vs Turbidity:
R2 = 0.0385

SALINITY UNITS IN PPT AND EQUAL TO PRACTICAL SALNITY UNITS (PSU) PPT vs Turbidity:
R2 = 0.0046

SECCHI DEPTH M vs Turbidity:
R2 = 0.1311

TOTAL ALKALINITY AS CACO3 MG/L vs Turbidity:
R2 = 0.1068

TOTAL DISSOLVED NITROGEN MG/L vs Turbidity:
R2 = 0.0257

TOTAL DISSOLVED PHOSPHORUS MG/L vs Turbidity:
R2 = 0.0502

TOTAL SUSPENDED SOLIDS MG/L vs Turbidity:
R2 = 0.3017

WATER TEMPERATURE DEG C vs Turbidity:
R2 = 0.0062